/*
// MULTOPP5 - by Paul Hsieh (c) 1997, 1998, All rights reserved.
//
// This program is public domain, subject to the conditions that any use made
// of it that results in a derivative work or product must credit the author,
// Paul Hsieh and that this source never be distributed without these comments
// appearing intact at the top of the program.
//
// MULTOPP5 searches for optimal sequences of adds, subs, shls, leas, to
// simulate a constant multiply on a Pentium. It is assumed that no more
// than two registers are used (likely a somewhat faulty, but necessary
// assumption) and that the intermediate values are bounded by limits of
// 0..n, where n is the maximum multiply factor simulated. The results tend
// to justify the assumptions. It is easy to show that using more than 2
// registers only affects sequences of at least 5 instructions/3 clocks; that
// is to say no sequences given here in 3 clocks or less can be improved for
// total clock count. For numbers up to 2048, there is never a need to go
// beyond 6 clocks; nearly 90% of them running in 5 clocks or fewer.
//
// The ordering used is a lexical one:
//
// (number of clocks, whether or not the V pipe is free, total size)
//
// The first two optimize clock usage, taking into account that its potential
// use inside other code is best served by additional pairing after it has
// completed. There is no heuristic for waiting before writing to the ebx
// register. The code size optimization is good because otherwise the code
// has a tendency to add in extra meaningless instructions that does not
// affect the total clock count, but which is not how any reasonable human
// would code.
//
// Terje Mathisen made the good point that assuming eax is not ready for
// address generation on the first clock would also be a good thing to add. I
// have not looked into this.
//
// The algorithm is a departure from my original one which used an "OpenList"
// and simply grew it incrementally as required. The present algorithm is
// simply a depth first search through the code space, pruning code sequences
// that are less efficient than code sequences already encountered. The
// search is iteratively deepened by maximum (clock,V pipe state) allowed by
// minimal granular steps so that nothing is missed along the search, and so
// the search path can find suboptimal paths before wasting unpredictable
// resources on them. A table of "best encounters so far" is maintained.
// The size of this table, so far, appears to be the biggest limiting factor
// as I can produce results up to 2048 (up from 512; my 1993 result) easily,
// but 4096 is completely undoable with 32 MB of RAM. Terje Mathisen was able
// to reach 4096 on a 64MB equipped DEC alpha, and indeed pushed it up to
// 5040. However, searches up to non-power of 2s are suspect since the
// corresponding left shift over that number is not present in the search
// space of partial results.
//
// Besides leading to optimal p5 multiplies (something of diminishing value
// as the x86 series gets faster in subsequent iterations) I am hoping this
// example of "finding optimal code sequences" germinates discussion on
// optimal code generation in general.
//
// Thanks to Jeremy McDonald, Robert Prins and Bjorn Stromvall for pointing
// out a couple obvious flaws with the first version of this program (some of
// the instructions I was using had obvious faster or shorter equivalents.)
*/
#include <stdio.h>
typedef unsigned char byte;
typedef struct {
int CodeLength;
int Weight[4];
byte CodeFrag[8];
byte cost;
byte p5flags;
char * Name;
} StepType;
/* Pentium pairing bitflags */
#define UV 0xC0
#define PU 0x80
#define PV 0x40
#define NP 0x00
#define AB 0x03
#define Ax 0x02
#define xB 0x01
#define xx 0x00
/* ... including potential AGI stalls */
#define ab (0x0C|AB)
#define ax (0x08|Ax)
#define xb (0x04|xB)
#define MBS 2
#define P5Attr(pairing,readreg,writereg) ((pairing)|((readreg)<<MBS)|(writereg))
/*
The instruction characteristics. The Weight[] vector is really a 2x2
matrix that and (EAX,EBX) vector is multiplied by to calculate the effect
of the instruction on these registers.
In another version of this program I actually used the code fragments to
compile up the resulting subroutines to test their correctness.
The cost instruction field was to be used for encompassing multi-clock
instructions (I wrote the original version before Pentium's ever existed.)
Of course none of these instructions are multi-clock. Hmmm ... now that I
think of is, technically, I could put 'xchg eax,ebx' in here, but I highly
doubt it would ever be used.
The p5flags field encode the pairability of the instruction, the register
read requirements and the register write requirements.
The name field just gives what the instruction mneumonic is, so that the
program can output results in a readable form.
*/
StepType Step[] = {
{2, {1, 0, 1, 0}, { 0x89, 0xc3 } , 1, P5Attr(UV,Ax,xB), " mov ebx,eax " },
{2, {0, 1, 0, 1}, { 0x89, 0xd8 } , 1, P5Attr(UV,xB,Ax), " mov eax,ebx " },
{2, {0, 0, 0, 1}, { 0x31, 0xc0 } , 1, P5Attr(UV,Ax,Ax), " xor eax,eax " },
{2, {1, 1, 0, 1}, { 0x01, 0xd8 } , 1, P5Attr(UV,AB,Ax), " add eax,ebx " },
{2, {1, 0, 1, 1}, { 0x01, 0xc3 } , 1, P5Attr(UV,AB,xB), " add ebx,eax " },
{2, {1,-1, 0, 1}, { 0x29, 0xd8 } , 1, P5Attr(UV,AB,Ax), " sub eax,ebx " },
{2, {1, 0,-1, 1}, { 0x29, 0xc3 } , 1, P5Attr(UV,AB,xB), " sub ebx,eax " },
{2, {2, 0, 0, 1}, { 0x01, 0xc0 } , 1, P5Attr(UV,Ax,Ax), " add eax,eax " },
{2, {1, 0, 0, 2}, { 0x01, 0xdb } , 1, P5Attr(UV,xB,xB), " add ebx,ebx " },
{3, { 4, 0, 0, 1}, { 0xc1, 0xe0, 0x02 }, 1, P5Attr(PU,Ax,Ax), " shl eax,02H " },
{3, { 8, 0, 0, 1}, { 0xc1, 0xe0, 0x03 }, 1, P5Attr(PU,Ax,Ax), " shl eax,03H " },
{3, { 16, 0, 0, 1}, { 0xc1, 0xe0, 0x04 }, 1, P5Attr(PU,Ax,Ax), " shl eax,04H " },
{3, { 32, 0, 0, 1}, { 0xc1, 0xe0, 0x05 }, 1, P5Attr(PU,Ax,Ax), " shl eax,05H " },
{3, { 64, 0, 0, 1}, { 0xc1, 0xe0, 0x06 }, 1, P5Attr(PU,Ax,Ax), " shl eax,06H " },
{3, {128, 0, 0, 1}, { 0xc1, 0xe0, 0x07 }, 1, P5Attr(PU,Ax,Ax), " shl eax,07H " },
{3, {256, 0, 0, 1}, { 0xc1, 0xe0, 0x08 }, 1, P5Attr(PU,Ax,Ax), " shl eax,08H " },
{3, {512, 0, 0, 1}, { 0xc1, 0xe0, 0x09 }, 1, P5Attr(PU,Ax,Ax), " shl eax,09H " },
{3,{1024, 0, 0, 1}, { 0xc1, 0xe0, 0x0A }, 1, P5Attr(PU,Ax,Ax), " shl eax,0aH " },
{3,{2048, 0, 0, 1}, { 0xc1, 0xe0, 0x0B }, 1, P5Attr(PU,Ax,Ax), " shl eax,0bH " },
{3, {3, 0, 0, 1}, { 0x8d, 0x04, 0x40 } , 1, P5Attr(UV,ax,Ax), " lea eax,[eax+eax*2] " },
{3, {5, 0, 0, 1}, { 0x8d, 0x04, 0x80 } , 1, P5Attr(UV,ax,Ax), " lea eax,[eax+eax*4] " },
{3, {9, 0, 0, 1}, { 0x8d, 0x04, 0xc0 } , 1, P5Attr(UV,ax,Ax), " lea eax,[eax+eax*8] " },
{3, {1, 0, 0, 4}, { 0xc1, 0xe3, 0x02 }, 1, P5Attr(PU,xB,xB), " shl ebx,02H " },
{3, {1, 0, 0, 8}, { 0xc1, 0xe3, 0x03 }, 1, P5Attr(PU,xB,xB), " shl ebx,03H " },
{3, {1, 0, 0, 16}, { 0xc1, 0xe3, 0x04 }, 1, P5Attr(PU,xB,xB), " shl ebx,04H " },
{3, {1, 0, 0, 32}, { 0xc1, 0xe3, 0x05 }, 1, P5Attr(PU,xB,xB), " shl ebx,05H " },
{3, {1, 0, 0, 64}, { 0xc1, 0xe3, 0x06 }, 1, P5Attr(PU,xB,xB), " shl ebx,06H " },
{3, {1, 0, 0, 128}, { 0xc1, 0xe3, 0x07 }, 1, P5Attr(PU,xB,xB), " shl ebx,07H " },
{3, {1, 0, 0, 256}, { 0xc1, 0xe3, 0x08 }, 1, P5Attr(PU,xB,xB), " shl ebx,08H " },
{3, {1, 0, 0, 512}, { 0xc1, 0xe3, 0x09 }, 1, P5Attr(PU,xB,xB), " shl ebx,09H " },
{3, {1, 0, 0,1024}, { 0xc1, 0xe3, 0x0a }, 1, P5Attr(PU,xB,xB), " shl ebx,0aH " },
{3, {1, 0, 0,2048}, { 0xc1, 0xe3, 0x0b }, 1, P5Attr(PU,xB,xB), " shl ebx,0bH " },
{3, {1, 0, 0, 3}, { 0x8d, 0x1c, 0x5b } , 1, P5Attr(UV,xb,xB), " lea ebx,[ebx+ebx*2] " },
{3, {1, 0, 0, 5}, { 0x8d, 0x1c, 0x9b } , 1, P5Attr(UV,xb,xB), " lea ebx,[ebx+ebx*4] " },
{3, {1, 0, 0, 9}, { 0x8d, 0x1c, 0xdb } , 1, P5Attr(UV,xb,xB), " lea ebx,[ebx+ebx*8] " },
{3, {1, 2, 0, 1}, { 0x8d, 0x04, 0x58 } , 1, P5Attr(UV,ab,Ax), " lea eax,[eax+ebx*2] " },
{3, {1, 4, 0, 1}, { 0x8d, 0x04, 0x98 } , 1, P5Attr(UV,ab,Ax), " lea eax,[eax+ebx*4] " },
{3, {1, 8, 0, 1}, { 0x8d, 0x04, 0xd8 } , 1, P5Attr(UV,ab,Ax), " lea eax,[eax+ebx*8] " },
{3, {1, 0, 1, 2}, { 0x8d, 0x1c, 0x58 } , 1, P5Attr(UV,ab,xB), " lea ebx,[eax+ebx*2] " },
{3, {1, 0, 1, 4}, { 0x8d, 0x1c, 0x98 } , 1, P5Attr(UV,ab,xB), " lea ebx,[eax+ebx*4] " },
{3, {1, 0, 1, 8}, { 0x8d, 0x1c, 0xd8 } , 1, P5Attr(UV,ab,xB), " lea ebx,[eax+ebx*8] " },
{3, {2, 1, 0, 1}, { 0x8d, 0x04, 0x43 } , 1, P5Attr(UV,ab,Ax), " lea eax,[ebx+eax*2] " },
{3, {4, 1, 0, 1}, { 0x8d, 0x04, 0x83 } , 1, P5Attr(UV,ab,Ax), " lea eax,[ebx+eax*4] " },
{3, {8, 1, 0, 1}, { 0x8d, 0x04, 0xc3 } , 1, P5Attr(UV,ab,Ax), " lea eax,[ebx+eax*8] " },
{3, {1, 0, 2, 1}, { 0x8d, 0x1c, 0x43 } , 1, P5Attr(UV,ab,xB), " lea ebx,[ebx+eax*2] " },
{3, {1, 0, 4, 1}, { 0x8d, 0x1c, 0x83 } , 1, P5Attr(UV,ab,xB), " lea ebx,[ebx+eax*4] " },
{3, {1, 0, 8, 1}, { 0x8d, 0x1c, 0xc3 } , 1, P5Attr(UV,ab,xB), " lea ebx,[ebx+eax*8] " },
{3, {0, 2, 0, 1}, { 0x8d, 0x04, 0x1b } , 1, P5Attr(UV,xb,Ax), " lea eax,[ebx+ebx] " },
{3, {0, 3, 0, 1}, { 0x8d, 0x04, 0x5b } , 1, P5Attr(UV,xb,Ax), " lea eax,[ebx+ebx*2] " },
{3, {0, 5, 0, 1}, { 0x8d, 0x04, 0x9b } , 1, P5Attr(UV,xb,Ax), " lea eax,[ebx+ebx*4] " },
{3, {0, 9, 0, 1}, { 0x8d, 0x04, 0xdb } , 1, P5Attr(UV,xb,Ax), " lea eax,[ebx+ebx*8] " },
{3, {1, 0, 2, 0}, { 0x8d, 0x1c, 0x00 } , 1, P5Attr(UV,ax,xB), " lea ebx,[eax+eax] " },
{3, {1, 0, 3, 0}, { 0x8d, 0x1c, 0x40 } , 1, P5Attr(UV,ax,xB), " lea ebx,[eax+eax*2] " },
{3, {1, 0, 5, 0}, { 0x8d, 0x1c, 0x80 } , 1, P5Attr(UV,ax,xB), " lea ebx,[eax+eax*4] " },
{3, {1, 0, 9, 0}, { 0x8d, 0x1c, 0xc0 } , 1, P5Attr(UV,ax,xB), " lea ebx,[eax+eax*8] " },
{7, {0, 4, 0, 1}, { 0x8d, 0x04, 0x9d, 0x00, 0x00, 0x00, 0x00 }, 1, P5Attr(UV,xb,Ax), " lea eax,[ebx*4+0H] " },
{7, {0, 8, 0, 1}, { 0x8d, 0x04, 0xdd, 0x00, 0x00, 0x00, 0x00 }, 1, P5Attr(UV,xb,Ax), " lea eax,[ebx*8+0H] " },
{7, {1, 0, 4, 0}, { 0x8d, 0x1c, 0x85, 0x00, 0x00, 0x00, 0x00 }, 1, P5Attr(UV,ax,xB), " lea ebx,[eax*4+0H] " },
{7, {1, 0, 8, 0}, { 0x8d, 0x1c, 0xc5, 0x00, 0x00, 0x00, 0x00 }, 1, P5Attr(UV,ax,xB), " lea ebx,[eax*8+0H] " },
{7, {4, 0, 0, 1}, { 0x8d, 0x04, 0x85, 0x00, 0x00, 0x00, 0x00 }, 1, P5Attr(UV,ax,Ax), " lea eax,[eax*4+0H] " },
{7, {8, 0, 0, 1}, { 0x8d, 0x04, 0xc5, 0x00, 0x00, 0x00, 0x00 }, 1, P5Attr(UV,ax,Ax), " lea eax,[eax*8+0H] " },
{7, {1, 0, 0, 4}, { 0x8d, 0x1c, 0x9d, 0x00, 0x00, 0x00, 0x00 }, 1, P5Attr(UV,xb,xB), " lea ebx,[ebx*4+0H] " },
{7, {1, 0, 0, 8}, { 0x8d, 0x1c, 0xdd, 0x00, 0x00, 0x00, 0x00 }, 1, P5Attr(UV,xb,xB), " lea ebx,[ebx*8+0H] " },
{0, {1, 0, 0, 1}, { 0x90, 0x90, 0x90 } } , -1, P5Attr(NP,AB,AB), " Sentinel " };
/*
Declaring ClockCountType as a float allows me to put a hack for the
total code length (otherwise a lot of silly results show up, where
dead clocks are filled with irrelevant instructions.) However, as the
whole algorithm is limited by the "SpaceCost" array size, it can make
sense to make this a short, or indeed a char. I have only experimented
a little here.
*/
typedef float ClockCountType;
/*
The pipe state structure is what I use to model the pentium's internal
resource contentions. It basically keeps track of the clocks consumed
so far as well as usage bits for RAW (read after write) and WAW (write
after write) dependencies as well as potential AGI contentions from the
previous clock.
*/
typedef struct {
ClockCountType clocks;
unsigned char freepipe;
unsigned char contentionmask;
} PipeStateStructType;
#define UFREE PU
#define VFREE PV
void InitPSS(PipeStateStructType * pss) {
pss->clocks=0;
pss->freepipe = UFREE;
pss->contentionmask = 0;
}
/*
Model Pentium scheduling as best as I know it, to count clocks. Also
adds in tiny fractional fudge clocks for total code length.
*/
PipeStateStructType * AddInstruction(PipeStateStructType * pss, int inst) {
static PipeStateStructType new_pss;
char imask;
ClockCountType iclocks;
int RAW,AGI,i,j;
imask = Step[inst].p5flags;
iclocks = Step[inst].cost;
/* Contentions on current instruction, work out first clock */
new_pss = *pss;
/* Is there a RAW, WAW or AGI contention with the previous state? */
j = ((((imask&AB)<<MBS)|imask) & pss->contentionmask)&(~(UV|AB)); /* ?AW */
i = (imask & pss->contentionmask)&(~(UV|AB)); /* RAW */
/* Is there an AGI stall? */
if( (i & (AB<<(2*(MBS))))!=0 ) { /* RAW on address? => AGI stall */
new_pss.clocks+=1;
i &= ~(AB<<(2*(MBS))); /* Doesn't affect issue pairing */
}
/* Does this instruction pair? And if so is it a free clock or not? */
if( (0!=j) || ((imask & (pss->freepipe))==0) ) { /* WAW/RAW upariable */
new_pss.clocks+=1;
new_pss.freepipe = UFREE;
if( imask & PU ) new_pss.freepipe = VFREE;
} else {
new_pss.freepipe ^= (UFREE ^ VFREE);
if( new_pss.freepipe==VFREE ) new_pss.clocks+=1;
}
/* The potential extra clocks on this instruction */
new_pss.clocks+=(iclocks-1);
/* Work out end of clock conditions; Available for next pair? Extra
shadowed delay? */
RAW=0; /* No RAW on next inst? */
if( new_pss.clocks!=pss->clocks )
RAW=imask & AB; /* V pipe dependencies on next inst */
AGI = 0; /* No stall after multiple clocks? */
if( new_pss.clocks==pss->clocks )
AGI = ((pss->contentionmask)>>MBS); /* AGI on previous writes */
else if( iclocks<2 ) {
AGI = (pss->contentionmask)>>(2*(MBS)); /* AGI from pairing */
}
AGI = (AGI|imask) & AB; /* AGI on current writes as well */
/* These are the contention flags */
new_pss.contentionmask = ((AGI<<MBS)|RAW)<<MBS;
/* Instruction length fudge factor (smaller code is better) */
new_pss.clocks += (Step[inst].CodeLength)/1024.0;
return &new_pss;
}
/* Note that the limit has been set to 512 so that it can run on even modest
PCs. Change the number to a higher value to get more results (though you
will need a quadratically increased amount of memory for the program to
complete in a reasonable amount of time.) */
#define Limit 2048
#define ValLimit (Limit+2)
#define Invalid 0x5AAA
#define MAXCLOCKS 255
#define MaxPathLength 12
int Path[MaxPathLength]; /* Yeah, like we'll ever be able to do 12 ... */
unsigned char Done[Limit+2];
ClockCountType Best[Limit+2];
ClockCountType SpaceCost[Limit+1][Limit+2];
unsigned int Reported = 0;
int FragmentLength[Limit+2];
unsigned char Fragment[Limit+2][MaxPathLength];
void InitSpaceState() {
int i,j;
for(i=0;i<=Limit;i++) {
for(j=0;j<=Limit+1;j++) {
SpaceCost[i][j] = MAXCLOCKS;/* Unrealistic maximum total clocks */
}
}
for(i=0;i<=Limit+1;i++) {
Done[i] = 0; /* Not yet output */
Best[i] = MAXCLOCKS;
}
}
/* Do single component matrix multiply */
short WeightedSum( short A, short B, short Aw, short Bw ) {
if( ((Aw!=0) && A ==Invalid) ) return(Invalid);
if( (( A!=0) && Aw==Invalid) ) return(Invalid);
if( ((Bw!=0) && B ==Invalid) ) return(Invalid);
if( (( B!=0) && Bw==Invalid) ) return(Invalid);
return( A*Aw+B*Bw );
}
/* Do vector x matrix multiply and track "invalid" assignments */
int ApplyInst( short *A, short *B, int I ) {
int a,b;
a = WeightedSum( *A, *B, Step[I].Weight[0], Step[I].Weight[1] );
b = WeightedSum( *A, *B, Step[I].Weight[2], Step[I].Weight[3] );
if( a<0 || a>Limit ) a = Invalid;
if( b<0 || b>Limit ) b = Invalid;
/* No sense in turning a good value to a bad value */
if( (a==Invalid) && ((*A)!=Invalid) ) return 0;
if( (b==Invalid) && ((*B)!=Invalid) ) return 0;
*A = a;
*B = b;
return 1;
}
void GiveResult(int i) {
int j;
printf("Mul%04d:\t; // %d clocks\n",i,(int)(Best[i])/2);
for(j=0;j<FragmentLength[i];j++)
printf(" %s\n",Step[ Fragment[i][j] ].Name);
Done[i]=1;
Reported++;
}
/*
Hacked sentinel indicated a useless node (already been exhaustively
searched).
Terje Mathisen modified a version of this code to remove the need for
this hack by simply testing the A, B values for any out of range result.
It confirms that using a single out of range value, as I have done is
also sufficient. I did not merge with Terje's code, since it did not
substantially improve functionality or performance.
*/
#define USELESS_PATH -13666
/*
Counts the potential improvments as they happen. Used for pruning
search paths which offer nothing else for future search iterations.
*/
static int PotentialImprovements=0;
/*
The main search routine. It starts with obvious exit conditions (if the
current path is obsolete or if this path is slower than some previously
searched path.) It then updates the best cost if there has been an
improvement. If the cost exceeds the current cost iteration, it is marked
as a potential for a later deeper search and aborted. If the path length
is too long (this has never happened) it also exits.
From here the main loop is entered and continued by adding each possible
instruction one at a time and calling itself recursively. If after the
search no potential improvments for later search depths are found, then
the search is essentially exhaustive, and therefore there is nothing more
to be gained in searching it again in future interations. This pruning
step improved the search performance by about 17% in tests I ran.
*/
void SearchInstructionSpace( int A, int B, PipeStateStructType *ps, int d, int cmax ) {
PipeStateStructType p;
int i, picount;
ClockCountType cost;
short aa,bb;
if( B==Invalid ) B = Limit+1;
if( SpaceCost[A][B]==USELESS_PATH ) return;
picount = PotentialImprovements;
cost = (ps->clocks);
cost = cost * 2;
if( ps->freepipe==UFREE ) cost = cost+1; // Penalty for occupied V slot.
if( d!=0 ) {
if( SpaceCost[A][B] < cost ) return; // Slower solution?
if( SpaceCost[A][B] > cost ) {
SpaceCost[A][B] = cost;
if( cost<Best[A] ) {
Best[A]=cost;
FragmentLength[A]=d;
i=0;
do {
Fragment[A][i] = Path[i];
i++;
} while(i<d);
}
}
}
if( B==Limit+1 ) B = Invalid;
if( cost>cmax ) {
PotentialImprovements++;
return;
}
if( d>=MaxPathLength ) return;
i=0;
do {
Path[d]=i;
p = *AddInstruction(ps,i);
aa=A;
bb=B;
if( 1==ApplyInst( &aa, &bb, i ) )
SearchInstructionSpace(aa,bb,&p,d+1,cmax);
i++;
} while(Step[i].cost>0);
if( picount==PotentialImprovements ) {
if( B==Invalid ) {
B = Limit+1;
if( A==1 ) return;
}
SpaceCost[A][B]=USELESS_PATH; // Prune node, nothing more to search.
}
}
/*
Reports on searches after each iteration. Just search for filled entries in
the Best[] array that have not been already marked in the Done[] array.
*/
void GetReports(int c) {
int i,j;
for(i=0;i<=Limit;i++) {
if( Done[i]==0 && Best[i]<=c ) {
printf("Mul%04d:\t; // %d clocks\n",i,(int)(Best[i])/2);
for(j=0;j<FragmentLength[i];j++)
printf(" %s\n",Step[ Fragment[i][j] ].Name);
Done[i]=1;
Reported++;
}
}
}
void main(void) {
PipeStateStructType p,q;
int i=0;
int t=0;
InitSpaceState();
InitPSS(&p);
/*
Iterate through a granularity of possible increasing costs up to but not
including the last lexical entry (which is taken care of via the search
itself.) This assumes the cost never decreases as instructions are added
which, of course, is true.
*/
for(i=0;Reported<=Limit && i<MaxPathLength*2;i++) {
SearchInstructionSpace( 1, Invalid, &p, 0, i );
GetReports(i);
}
}